# -----------------------------------------------------------------------------#  
# Replication code for: 
# 
#  "Enforcing Compulsory Schooling through Credible Coercion: Lessons from
#  Australia’s Northern Territory Intervention", Economic Record, 2018
#
#  Author: Kyle Peyton (kyle.peyton@yale.edu)
# 
#  This script replicates the Figures and regression results for the 
#  diff-in-diff analyses using the NAPLAN data on attendance rates.
# ----------------------------------------------------------------------------# 

# Load required packages
suppressMessages({
  library(tidyverse)
  library(ggthemes)
  library(clusterSEs)
})

# Set theme for ggplot
theme_kyle <- function () { 
  theme_tufte(base_size = 14) %+replace%
    theme(legend.position = "bottom",
          legend.title = element_blank(),
          strip.background = element_blank(),
          panel.border = element_rect(fill = NA, colour = "gray"))
}

# Read in naplan data
naplan <- read_csv("naplan.csv")

#------Figure 2------
read_df <- 
  naplan %>%
  filter(indig == 1, domain == "read") %>% 
  mutate(grade_factor = paste("Grade level: Year", grade))

read_plot <- 
  ggplot() +
  geom_vline(xintercept = 2009, col = "black", lty = 3, size = 1) +
  geom_line(data = read_df %>% filter(jurisdiction != "NT"),
            aes(x = year, y = part_pct, group = jurisdiction, color = "grey"),
            alpha = 0.5) +
  geom_point(data = read_df %>% filter(jurisdiction != "NT"),
             aes(x = year, y = part_pct, group = jurisdiction, color = "grey"),
             alpha = 1, pch = 19) +
  geom_line(data = read_df %>% filter(jurisdiction == "NT"), 
            aes(x = year, y = part_pct, group = jurisdiction, color = "black"),
            alpha = 0.8, size = 1) +
  geom_point(data = read_df %>% filter(jurisdiction == "NT"), 
             aes(x = year, y = part_pct, group = jurisdiction, color = "black"),
             alpha = 1, pch = 19) +
  scale_y_continuous(breaks = seq(0.6, 1, 0.10), limits = c(0.60, 1), 
                     labels = scales::percent_format()) +
  scale_color_manual(name = "", values = c("grey" =  "grey", "black" = "black"),
                     labels = c("Northern Territory", 
                                "Other Australian State/Territory")) +
  ylab("Indigenous student participation rate") + 
  xlab("") + 
  facet_wrap(~ grade_factor) + 
  theme_kyle()

#------Figure 3------
math_df <- 
  naplan %>%
  filter(indig == 1, domain == "math") %>% 
  mutate(grade_factor = paste("Grade level: Year", grade)) 


math_plot <- 
  ggplot() +
  geom_vline(xintercept = 2009, col = "black", lty = 3, size = 1) +
  geom_line(data = math_df %>% filter(jurisdiction != "NT"),
            aes(x = year, y = part_pct, group = jurisdiction, color = "grey"),
            alpha = 0.5) +
  geom_point(data = math_df %>% filter(jurisdiction != "NT"),
             aes(x = year, y = part_pct, group = jurisdiction, color = "grey"),
             alpha = 1, pch = 19) +
  geom_line(data = math_df %>% filter(jurisdiction == "NT"), 
            aes(x = year, y = part_pct, group = jurisdiction, color = "black"),
            alpha = 0.8, size = 1) +
  geom_point(data = math_df %>% filter(jurisdiction == "NT"), 
             aes(x = year, y = part_pct, group = jurisdiction, color = "black"),
             alpha = 1, pch  = 19) +
  scale_y_continuous(breaks = seq(0.6, 1, 0.10), limits = c(0.60, 1), 
                     labels = scales::percent_format()) +
  scale_color_manual(name = "", values = c("grey" =  "grey", "black" = "black"),
                     labels = c("Northern Territory", 
                                "Other Australian State/Territory")) +
  ylab("Indigenous student participation rate") + 
  xlab("") + 
  facet_wrap(~ grade_factor) + 
  theme_kyle()

#------Export graphics----
ggsave("figure2.pdf", read_plot, width = 7, height = 5, device = "pdf")
system("open figure2.pdf")

ggsave("figure3.pdf", math_plot, width = 7, height = 5, device = "pdf")
system("open figure3.pdf")

#------Table 5------

# Subset to indig students
indig <- 
  naplan %>% 
  filter(indig == 1)

# Create the design matrix. 
years <- model.matrix(~ -1 + as.factor(year), data = indig)
colnames(years) <- c("t2008", "t2009", "t2010", "t2011", "t2012")

grades <- model.matrix(~ -1 + as.factor(grade), data = indig)
colnames(grades) <- c("grade3", "grade5", "grade7", "grade9")

juris <- model.matrix(~ -1 + as.factor(jurisdiction), data = indig)
colnames(juris) <- c("ACT", "NSW", "NT", "QLD", "SA", "TAS", "VIC", "WA")

domains <- model.matrix(~ -1 + as.factor(domain), data = indig)
colnames(domains) <- c("math", "read")

indig <- cbind(indig, years, grades, juris, domains)

# Fit the model w/ 2000 bootstrap replications
set.seed(123)
B <- 2000 

# Regression specification for participation rates 
partfit1 <- glm(formula = part_pct ~ NT*t2009 + NT*t2010 + NT*t2011 + NT*t2012 + 
                  NT*t2009 + grade3 + grade5 + grade7 + read, 
                data = indig, weights = total_n)

# Estract coefficients
coef(partfit1)

# Extract R^2
r <- partfit1$residuals 
f <- partfit1$fitted 
w <- partfit1$weights 
if (is.null(w)) { 
  mss <- if (attr(partfit1$terms, "intercept")) 
    sum((f - mean(f))^2) 
  else sum(f^2) 
  rss <- sum(r^2) 
} else { 
  mss <- if (attr(partfit1$terms, "intercept")) { 
    m <- sum(w * f/sum(w)) 
    sum(w * (f - m)^2) 
  } 
  else sum(w * f^2) 
  rss <- sum(w * r^2) 
  r <- sqrt(w) * r 
} 
r.squared <- mss/(mss + rss) 
r.squared

# 95% CIs and P-values 
cpartfit11 <- cluster.wild.glm(partfit1, dat = indig, cluster = ~ jurisdiction, 
                               boot.reps = B, impose.null = FALSE)
cpartfit11
